Audio Embedding
- Audio Preprocessing: Load, process and extract embeddings from various audio samples using an open-source library such as scipy or librosa.
- Embedding Creation: Convert the audio signals into embeddings using a pre-trained model or a simple RNN-based architecture. (The code below uses a simpler stand-in: the time-averaged log-spectrogram of each file.)
- Vector Database: Use faiss-cpu to store and search for audio embeddings efficiently.
- RAG (Retrieval-Augmented Generation): Implement the query mechanism to retrieve the most similar audio embeddings.
- Visualization: Display tables for audio metadata and embeddings, along with waveform graphs for the query audio and the predicted (retrieved) audio.
In [ ]:
%pip install -q numpy matplotlib scipy faiss-cpu librosa pandas
Python interpreter will be restarted. ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. petastorm 0.11.4 requires pyspark>=2.1.0, which is not installed. pandas-profiling 3.1.0 requires joblib~=1.0.1, but you have joblib 1.4.2 which is incompatible. mleap 0.20.0 requires scikit-learn<0.23.0,>=0.22.0, but you have scikit-learn 1.6.1 which is incompatible. Python interpreter will be restarted.
In [ ]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import faiss
import pandas as pd
from scipy.signal import spectrogram
# Directory that holds the audio files to index. It is read at module level
# to list the files and build their embeddings, and again inside
# plot_predicted_audio_waveforms() to locate the retrieved files.
audio_directory = "/curated/AudioStore/audios/"
# Function to get list of audio files in the directory
def get_audio_files(directory, extensions=(".mp3",)):
    """Return the names of the audio files found directly inside *directory*.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).
        extensions: File-name suffix, or iterable of suffixes, to accept.
            Matching is case-insensitive. Defaults to MP3 files only.

    Returns:
        A list of file names (not full paths), sorted alphabetically.

    Raises:
        OSError: If *directory* cannot be listed (propagated from
            ``os.listdir``).
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)
    # os.listdir() returns entries in arbitrary, platform-dependent order.
    # Sorting keeps each file's position -- and therefore the row id it gets
    # in any index built from this list -- stable between runs.
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(suffixes)
    )
# Function to extract spectrogram as embeddings
def extract_spectrogram_embedding(audio_file, nperseg=256, eps=1e-7):
    """Return a fixed-length log-spectrogram embedding for one audio file.

    The file is loaded with librosa, its spectrogram is computed with
    ``scipy.signal.spectrogram``, the power values are log-scaled, and the
    result is averaged over time so that every file yields a vector of the
    same length regardless of its duration.

    Args:
        audio_file: Path of the audio file to load.
        nperseg: Samples per spectrogram segment. The default of 256 matches
            scipy's own default, so results are unchanged for callers that
            do not pass it. The embedding has ``nperseg // 2 + 1`` entries.
        eps: Small constant added before the logarithm to avoid ``log(0)``.

    Returns:
        A 1-D NumPy array: the per-frequency-bin mean of the log spectrogram.

    Raises:
        ValueError: If the file decodes to zero samples.

    NOTE(review): the file is loaded at its native sampling rate
    (``sr=None``), so the frequency covered by each bin depends on that rate.
    Embeddings of files recorded at different rates are therefore not
    strictly comparable -- confirm the corpus uses one rate, or resample.
    """
    # sr=None keeps the file's own sampling rate instead of resampling.
    y, sr = librosa.load(audio_file, sr=None)
    if y.size == 0:
        raise ValueError(f"Audio file contains no samples: {audio_file}")

    if len(y) < nperseg:
        # For inputs shorter than one segment scipy silently shrinks nperseg,
        # which changes the number of frequency bins and so the embedding
        # length. Zero-pad to keep all embeddings the same size.
        # Assumes y is 1-D (librosa's default mono output) -- TODO confirm.
        y = np.pad(y, (0, nperseg - len(y)))

    _, _, Sxx = spectrogram(y, fs=sr, nperseg=nperseg)
    # Log scale compresses the large dynamic range of the power values.
    log_power = np.log(Sxx + eps)
    # Averaging over the time axis (axis=1) removes the duration dependence.
    return np.mean(log_power, axis=1)
# List the audio file names available in the configured directory
audio_files = get_audio_files(audio_directory)
# Compute one embedding per file; embeddings[i] belongs to audio_files[i]
embeddings = [
    extract_spectrogram_embedding(os.path.join(audio_directory, audio_name))
    for audio_name in audio_files
]
# Create FAISS index to store and retrieve embeddings
def create_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over the given embeddings.

    Args:
        embeddings: A sequence of equal-length vectors, or a 2-D array of
            shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every vector, added in input
        order, so row ``i`` of the index corresponds to ``embeddings[i]``.

    Raises:
        ValueError: If *embeddings* is empty, is not two-dimensional, or
            holds vectors of differing lengths.
    """
    # FAISS expects a C-contiguous float32 matrix.
    matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
    if matrix.ndim != 2 or matrix.shape[0] == 0 or matrix.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of equal-length "
            f"vectors, got array of shape {matrix.shape}"
        )
    index = faiss.IndexFlatL2(matrix.shape[1])  # exact search, L2 distance
    index.add(matrix)
    return index
# Build the FAISS index over all file embeddings; it is passed to
# retrieve_similar_audio() for the query below.
faiss_index = create_faiss_index(embeddings)
# Function to retrieve similar audio based on a query
def retrieve_similar_audio(query_file, faiss_index, k=2):
    """Find the indexed audio files most similar to *query_file*.

    Args:
        query_file: Path of the audio file used as the query.
        faiss_index: FAISS index to search.
        k: Maximum number of neighbours to return. It is capped at the
            number of vectors stored in the index.

    Returns:
        A tuple ``(indices, distances)`` of arrays with one row each:
        the positions of the nearest vectors in the index and their
        distances as reported by the index.

    Raises:
        ValueError: If *k* is smaller than 1, the index is empty, or the
            query embedding's length differs from the index dimensionality.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    if faiss_index.ntotal == 0:
        raise ValueError("Cannot search an empty FAISS index")
    # When asked for more neighbours than it holds, FAISS fills the missing
    # slots with index -1. Used as a Python list index, -1 silently selects
    # the LAST file, so never ask for more results than exist.
    k = min(k, faiss_index.ntotal)

    query_embedding = extract_spectrogram_embedding(query_file)
    # FAISS searches a batch of queries: shape (1, dim), float32.
    query_embedding = np.ascontiguousarray(
        query_embedding, dtype=np.float32
    ).reshape(1, -1)
    if query_embedding.shape[1] != faiss_index.d:
        raise ValueError(
            f"Query embedding has {query_embedding.shape[1]} dimensions but "
            f"the index expects {faiss_index.d}"
        )

    distances, indices = faiss_index.search(query_embedding, k)
    return indices, distances
# Query example: use one of the files in the audio directory
# ("duskwolf_101348.mp3") as the user's query and retrieve its k=2 nearest
# neighbours from the index.
query_audio_file = os.path.join(audio_directory, "duskwolf_101348.mp3")
indices, distances = retrieve_similar_audio(query_audio_file, faiss_index, k=2)
# Function to display audio table in markdown format
def display_audio_table(data, title):
    """Print *data* as a titled three-column table.

    Args:
        data: Iterable of ``(index, audio_name, embedding_preview)`` rows.
        title: Heading printed on its own line above the table.

    Returns:
        None. The table is written to standard output.
    """
    df = pd.DataFrame(data, columns=['Index', 'Audio', 'Embedding (First 5 embeddings)'])
    try:
        # Markdown gives the nicest formatting, but pandas delegates it to
        # the optional "tabulate" package.
        table = df.to_markdown(index=False)
    except ImportError:
        # tabulate is not installed: fall back to pandas' plain-text layout
        # rather than failing to show any results.
        table = df.to_string(index=False)
    print(f"\n{title}")
    print(table)
# Display the results in a table
def display_results(audio_files, embeddings, indices, query_audio_file):
    """Show the full catalogue, the query waveform, and the retrieved matches.

    Args:
        audio_files: Names of the indexed audio files.
        embeddings: Embedding vectors, aligned with *audio_files*.
        indices: Search result whose first row holds the positions of the
            retrieved files within *audio_files*.
        query_audio_file: Path of the audio file that was used as the query.
    """
    # Table 1: every indexed file with the first five embedding values.
    catalogue_rows = [
        (position, name, embeddings[position][:5])
        for position, name in enumerate(audio_files)
    ]
    display_audio_table(catalogue_rows, "Original Audio Files and Embeddings:")

    # Waveform of the audio the user queried with.
    plot_query_audio_waveform(query_audio_file)

    # Table 2: the retrieved files, numbered by their rank in the result.
    matched_positions = indices[0]
    matched_rows = [
        (rank, audio_files[position], embeddings[position][:5])
        for rank, position in enumerate(matched_positions)
    ]
    display_audio_table(matched_rows, "Predicted Audio Files and Embeddings:")

    # Waveforms of the retrieved files, drawn next to each other.
    plot_predicted_audio_waveforms(audio_files, indices)
# Plot the User Query Audio Waveform
def plot_query_audio_waveform(query_audio_file):
    """Plot the waveform of the query audio file.

    Args:
        query_audio_file: Path of the audio file to load and plot. Only its
            base name is shown in the figure title.
    """
    # Load at the file's native sampling rate (no resampling).
    samples, _ = librosa.load(query_audio_file, sr=None)
    display_name = os.path.basename(query_audio_file)

    _, ax = plt.subplots(figsize=(6, 4))
    ax.plot(samples)
    ax.set_title(f"User Query Audio Waveform: {display_name}")
    ax.set_xlabel("Sample Number")
    ax.set_ylabel("Amplitude")
    plt.show()
# Plot the Predicted Audio Waveforms side by side
def plot_predicted_audio_waveforms(audio_files, indices, directory=None):
    """Plot the waveforms of the retrieved audio files side by side.

    Args:
        audio_files: Names of the indexed audio files.
        indices: Search result whose first row holds the positions of the
            retrieved files within *audio_files*.
        directory: Folder containing the audio files. Defaults to the
            module-level ``audio_directory``.

    Returns:
        None. One figure with one subplot per retrieved file is shown;
        nothing is drawn when there are no valid results.
    """
    if directory is None:
        directory = audio_directory

    # FAISS marks "no neighbour found" with -1. As a list index that would
    # silently select the last file, so drop those entries.
    hits = [int(idx) for idx in indices[0] if idx >= 0]
    if not hits:
        # plt.subplots() cannot create a grid with zero columns.
        return

    # squeeze=False always yields a 2-D array of axes, so a single result
    # needs no special handling.
    _, axes = plt.subplots(1, len(hits), figsize=(12, 4), squeeze=False)
    for ax, idx in zip(axes[0], hits):
        retrieved_audio_file = audio_files[idx]
        retrieved_audio_name = os.path.basename(retrieved_audio_file)
        # sr=None keeps the file's native sampling rate.
        y, _ = librosa.load(os.path.join(directory, retrieved_audio_file), sr=None)
        ax.plot(y)
        ax.set_title(f"Predicted Audio Waveform: {retrieved_audio_name}")
        ax.set_xlabel("Sample Number")
        ax.set_ylabel("Amplitude")
    plt.tight_layout()  # keep neighbouring subplots from overlapping
    plt.show()
# Display results: print both tables and show the query and retrieved
# waveforms for the search performed above.
display_results(audio_files, embeddings, indices, query_audio_file)
Original Audio Files and Embeddings: | Index | Audio | Embedding (First 5 embeddings) | |--------:|:-----------------------------------|:--------------------------------------------------------------| | 0 | diesel_mercedes_190_d_33940.mp3 | [-15.053708 -12.10706 -12.368397 -13.738781 -14.303338] | | 1 | dog1_small_barking_angirly.mp3 | [-15.80812 -15.75914 -15.568337 -15.184358 -14.459221] | | 2 | dog2_small_dog_barking.mp3 | [-15.837855 -15.630339 -15.422517 -14.892724 -15.095776] | | 3 | dog3_barking.mp3 | [-15.591373 -15.58906 -15.320949 -14.866169 -14.144432] | | 4 | duskwolf_101348.mp3 | [-15.414704 -15.892582 -15.705197 -15.354351 -14.447784] | | 5 | truck1_Hyundai_Tractor_Engine.mp3 | [-13.268114 -9.427747 -11.069034 -11.546685 -9.743669] | | 6 | truck2_Dododge.mp3 | [-15.470654 -12.924455 -12.648085 -13.403902 -14.288434] | | 7 | truck_diesel_07dodge_rev_98278.mp3 | [-15.307348 -13.277036 -12.684735 -13.181681 -14.059005] | | 8 | wol3_wolves.mp3 | [-16.056688 -16.060814 -15.998393 -15.741171 -15.141277] | | 9 | wolf1_howling.mp3 | [-15.400748 -14.87836 -13.977464 -13.397399 -15.081363] | | 10 | wolf2_howling_wolves.mp3 | [-15.7981415 -15.991037 -15.964015 -15.954524 -15.771357 ] |
Predicted Audio Files and Embeddings: | Index | Audio | Embedding (First 5 embeddings) | |--------:|:--------------------|:---------------------------------------------------------| | 0 | duskwolf_101348.mp3 | [-15.414704 -15.892582 -15.705197 -15.354351 -14.447784] | | 1 | dog3_barking.mp3 | [-15.591373 -15.58906 -15.320949 -14.866169 -14.144432] |